# Load packages
library(DESeq2) # ‘1.46.0’
library(dplyr)
library(ggplot2)
library(org.Hs.eg.db) # library(org.Mm.eg.db) for mouse
library(ComplexHeatmap)
library(circlize)   # for color scales
library(grid)       # for gpar()
library(EnhancedVolcano)
# Load count_matrix
args(read.delim) # header = TRUE by default
function (file, header = TRUE, sep = "\t", quote = "\"", dec = ".", 
    fill = TRUE, comment.char = "", ...) 
NULL
rawCounts <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv")
# Format rawCounts as required by DESeq2
# 1. Set gene ID column as rowname
rownames(rawCounts) <- rawCounts$Gene.ID

# DESeq2 expects the rawCounts to contain only raw, unnormalized integer values, with no non-numeric columns.
# 2. Remove columns with non numerical values 
rawCounts_bkup <- rawCounts
rawCounts <- rawCounts[, -c(1, 2)]

class(rawCounts) # a data.frame
[1] "data.frame"
# 3. Convert rawCounts as a matrix
rawCounts <- as.matrix(rawCounts)
# Note: we can do both steps above in one line of code
# rawCounts <- as.matrix(rawCounts[, sapply(rawCounts, is.numeric)])
# checks
is.matrix(rawCounts) # should be TRUE
is.numeric(rawCounts) # should be TRUE
# # Filtering: remove genes with 0 reads in all samples.
dim(rawCounts)
[1] 65217    54
dim(rawCounts[which(rowSums(rawCounts)>0),])
[1] 44316    54
rawCounts <- rawCounts[which(rowSums(rawCounts)>0),]
# Load colData (a dataframe describing samples)
colData <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-experiment-design.tsv")
# Format colData as required by DESeq2
# 1. Set sample ID (here Run column) as rowname
rownames(colData) <- colData$Run

# 2. Remove unnecessary columns
# Tissue type is the primary condition of interest
# Individual IDs are treated as control variables to account for inter-individual variation
keep <- c("Sample.Characteristic.biopsy.site.", "Sample.Characteristic.individual.")
colData <- colData[, keep]

# 3. Rename columns
colnames(colData) <- c("tissue_type", "individual_id")

#  4. Verify that the sample IDs match exactly and are in the same order between rawCounts and colData
all(rownames(colData) == colnames(rawCounts)) # should return TRUE
[1] TRUE
head(colData)
# 5. convert both columns of colData as factors
class(colData$individual_id) # "character"
[1] "character"
colData$individual_id <- factor(colData$individual_id)
class(colData$individual_id) # "factor"
[1] "factor"
colData$tissue_type <- factor(colData$tissue_type)
# DESeq2 uses the first factor level as reference
levels(colData$tissue_type)  # reference = "colorectal cancer metastatic in the liver"
[1] "colorectal cancer metastatic in the liver"
[2] "normal"                                   
[3] "primary tumor"                            
# Comparisons: "normal" and "primary tumor" vs reference
# We want to compare "colorectal cancer metastatic in the liver" and  "primary tumor" against "normal" instead
# 6. Change levels order such that  reference = "normal"
levels(colData$tissue_type) <- c(
  "normal",
  "primary tumor",
  "colorectal cancer metastatic in the liver"
)
levels(colData$tissue_type)
[1] "normal"                                   
[2] "primary tumor"                            
[3] "colorectal cancer metastatic in the liver"
# Create the DEseq2DataSet object
# Design: control variable first, condition of interest second
# DESeq2 adjusts for individual variation before testing tissue_type differences
dds <- DESeqDataSetFromMatrix(countData = rawCounts, colData = colData, design = ~ individual_id + tissue_type)
dds # prints summary of the DESeqDataSet
class: DESeqDataSet 
dim: 44316 54 
metadata(1): version
assays(1): counts
rownames(44316): ENSG00000000003 ENSG00000000005 ... ENSG00000281918
  ENSG00000281920
rowData names(0):
colnames(54): SRR975551 SRR975552 ... SRR975603 SRR975604
colData names(2): tissue_type individual_id
colData(dds)  # shows sample annotations (e.g., tissue_type, individual_id)
DataFrame with 54 rows and 2 columns
                                        tissue_type individual_id
                                           <factor>      <factor>
SRR975551 colorectal cancer metastatic in the liver         AMC_2
SRR975552 colorectal cancer metastatic in the liver         AMC_3
SRR975553 colorectal cancer metastatic in the liver         AMC_5
SRR975554 colorectal cancer metastatic in the liver         AMC_6
SRR975555 colorectal cancer metastatic in the liver         AMC_7
...                                             ...           ...
SRR975600                                    normal        AMC_20
SRR975601                                    normal        AMC_21
SRR975602                                    normal        AMC_22
SRR975603                                    normal        AMC_23
SRR975604                                    normal        AMC_24
counts(dds)[1:5, 1:5]  # view first few genes and samples
                SRR975551 SRR975552 SRR975553 SRR975554 SRR975555
ENSG00000000003      6617      1352      1492      3390      1464
ENSG00000000005        69         1        20        23        12
ENSG00000000419      2798       714       510      1140      1667
ENSG00000000457       486       629       398       239       383
ENSG00000000460       466       342        73       227       193

#  Run the DESeq pipeline
dds <- DESeq(dds)
# Variance Stabilizing Transformation and  plot PCA
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup="tissue_type")+ theme_grey()

# Note: contrast = c(factor, numerator, denominator)
# means Compute log2 fold change as numerator / denominator.
# contrast = c("tissue_type", "primary tumor", "normal"): positive values mean higher in tumor, negative mean higher in normal
# contrast = c("tissue_type", "normal", "primary tumor"): positive values mean higher in normal, negative mean higher in tumor
# Extract results
res1 <- results(dds, contrast = c("tissue_type", "primary tumor", "normal"))
res1
log2 fold change (MLE): tissue_type primary tumor vs normal 
Wald test p-value: tissue type primary.tumor vs normal 
DataFrame with 44316 rows and 6 columns
                  baseMean log2FoldChange     lfcSE      stat      pvalue        padj
                 <numeric>      <numeric> <numeric> <numeric>   <numeric>   <numeric>
ENSG00000000003   1824.881      -0.441529  0.180629  -2.44439 1.45095e-02 3.48599e-02
ENSG00000000005     10.852       0.540095  0.435418   1.24040 2.14826e-01 3.30104e-01
ENSG00000000419    725.173      -1.009012  0.135824  -7.42881 1.09583e-13 3.96955e-12
ENSG00000000457    311.358       0.321168  0.113114   2.83934 4.52069e-03 1.26684e-02
ENSG00000000460    126.362      -1.357771  0.211021  -6.43429 1.24048e-10 2.26492e-09
...                    ...            ...       ...       ...         ...         ...
ENSG00000281909  0.2767389      0.3742786  2.936548 0.1274553    0.898580          NA
ENSG00000281910  0.0537801      0.0416978  3.668177 0.0113674    0.990930          NA
ENSG00000281912 10.2974190      0.1583833  0.227051 0.6975657    0.485449    0.614118
ENSG00000281918  0.2310112      0.2723037  3.664070 0.0743173    0.940758          NA
ENSG00000281920  0.2112458      0.0399116  3.666410 0.0108857    0.991315          NA
# Remove rows with NA 
res1 <- na.omit(res1)
res1.df <- as.data.frame(res1)
res1.df
# get gene names for gene IDs
res1.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res1.df), keytype = "ENSEMBL", column = "SYMBOL")
res1.df <- res1.df[!is.na(res1.df$symbol),]
res2 <- results(dds, contrast = c("tissue_type", "colorectal cancer metastatic in the liver", "normal"))
res2 <- na.omit(res2)
res2.df <- as.data.frame(res2)
res2.df
res2.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res2.df), keytype = "ENSEMBL", column = "SYMBOL")
res2.df <- res2.df[!is.na(res2.df$symbol),]
# Volcano plots 1
EnhancedVolcano(res1.df, x = 'log2FoldChange', y = 'padj', lab = res1.df$symbol,
                pCutoff = 0.05, FCcutoff = 1)

# Volcano plots 2
EnhancedVolcano(res2.df, x = 'log2FoldChange', y = 'padj', lab = res2.df$symbol,
                pCutoff = 0.05, FCcutoff = 1)

# Volcano plots using ggplot2

res1.df$expression <- "NOT"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange > 2] <- "UP"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange < -2] <- "DOWN"

ggplot(data = res1.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
  geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
  geom_point(size = 2) +
  scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"), 
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  theme_classic()

res2.df$expression <- "NOT"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange > 2] <- "UP"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange < -2] <- "DOWN"

ggplot(data = res2.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
  geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
  geom_point(size = 2) +
  scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"), 
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  theme_classic()

# Find significant genes
sigs1.df <- res1.df[res1.df$padj < 0.01,]
sigs1.df <- sigs1.df[(abs(sigs1.df$log2FoldChange) > 2),]
sigs1.df <- sigs1.df[sigs1.df$baseMean > 50,] # -  average normalized count for a gene across all samples
sigs1.df
NA
NA
sigs2.df <- res2.df[res2.df$padj < 0.01,]
sigs2.df <- sigs2.df[(abs(sigs2.df$log2FoldChange) > 2),]
sigs2.df <- sigs2.df[sigs2.df$baseMean > 50,] 
sigs2.df
# Combine significant genes from both comparisons
sigs_combined <- rbind(sigs1.df, sigs2.df)
# Get unique gene IDs
unique_genes <- unique(c(rownames(sigs1.df), rownames(sigs2.df)))
# Subset by unique rownames
sigs_combined <- sigs_combined[unique_genes, ]
# Extract normalized counts for those genes
mat <- counts(dds, normalized=T)[rownames(sigs_combined),] # or mat <- assay(vsd)[rownames(sigs_combined), ]
# Z-score transform each gene across samples
mat.z <- t(apply(mat, 1, scale))
colnames(mat.z) <- rownames(colData) 
tissue <- colData$tissue_type

# Color map
tissue_color <- c(
  "primary tumor" = "darkorange",
  "normal" = "forestgreen",
  "colorectal cancer metastatic in the liver" = "firebrick"
)

ha <- HeatmapAnnotation(
  Tissue = tissue,
  col = list(Tissue = tissue_color)
)
# Plot Heatmap
Heatmap(
  mat.z,
  name = "Z-score",
  top_annotation = ha,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_column_names = TRUE,
  show_row_names = FALSE,
  row_labels = sigs_combined[rownames(mat.z), "symbol"],# useful when above is TRUE
  row_names_gp = gpar(fontsize = 6),
  column_names_gp = gpar(fontsize = 6),
  heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)

# Get top 50 genes from both comparisons
topGenes1 <- head(sigs1.df[order(abs(sigs1.df$log2FoldChange), decreasing = TRUE), ], 50)
topGenes2 <- head(sigs2.df[order(abs(sigs2.df$log2FoldChange), decreasing = TRUE), ], 50)
# Combine both sets
topGenes_combined <- rbind(topGenes1, topGenes2)
# Get unique ENSEMBL IDs
unique_genes <- unique(c(rownames(topGenes1), rownames(topGenes2)))
# Subset by unique rownames
topGenes_combined <- topGenes_combined[unique_genes, ]
# Extract normalized counts for those genes
topmat <- counts(dds, normalized=T)[rownames(topGenes_combined),] 
topmat.z <- t(apply(topmat, 1, scale)) # Z score transformation of genes across samples.
colnames(topmat.z) <- rownames(colData)
Heatmap(
  topmat.z,
  name = "Z-score",
  top_annotation = ha,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_column_names = TRUE,
  show_row_names = TRUE,
  row_labels = topGenes_combined[rownames(topmat.z), "symbol"],
  row_names_gp = gpar(fontsize = 6),
  column_names_gp = gpar(fontsize = 6),
  heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)

---
title: "R Notebook"
output: html_notebook
---

```{r}
# Load packages
library(DESeq2) # ‘1.46.0’
library(dplyr)
library(ggplot2)
library(org.Hs.eg.db) # library(org.Mm.eg.db) for mouse
library(ComplexHeatmap)
library(circlize)   # for color scales
library(grid)       # for gpar()
library(EnhancedVolcano)
```

```{r}
# Load count_matrix
args(read.delim) # header = TRUE by default
rawCounts <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv")
```

```{r}
# Format rawCounts as required by DESeq2
```

```{r}
# 1. Set gene ID column as rowname
rownames(rawCounts) <- rawCounts$Gene.ID

# DESeq2 expects the rawCounts to contain only raw, unnormalized integer values, with no non-numeric columns.
# 2. Remove columns with non numerical values 
rawCounts_bkup <- rawCounts
rawCounts <- rawCounts[, -c(1, 2)]

class(rawCounts) # a data.frame
# 3. Convert rawCounts as a matrix
rawCounts <- as.matrix(rawCounts)
```
```{r}
# Note: we can do both steps above in one line of code
# rawCounts <- as.matrix(rawCounts[, sapply(rawCounts, is.numeric)])
```

```{r}
# checks
is.matrix(rawCounts) # should be TRUE
is.numeric(rawCounts) # should be TRUE
```

```{r}
# # Filtering: remove genes with 0 reads in all samples.
dim(rawCounts)
dim(rawCounts[which(rowSums(rawCounts)>0),])
rawCounts <- rawCounts[which(rowSums(rawCounts)>0),]
```
```{r}
# Load colData (a dataframe describing samples)
colData <- read.delim("http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-experiment-design.tsv")
```

```{r}
# Format colData as required by DESeq2
```

```{r}
# 1. Set sample ID (here Run column) as rowname
rownames(colData) <- colData$Run

# 2. Remove unnecessary columns
# Tissue type is the primary condition of interest
# Individual IDs are treated as control variables to account for inter-individual variation
keep <- c("Sample.Characteristic.biopsy.site.", "Sample.Characteristic.individual.")
colData <- colData[, keep]

# 3. Rename columns
colnames(colData) <- c("tissue_type", "individual_id")

#  4. Verify that the sample IDs match exactly and are in the same order between rawCounts and colData
all(rownames(colData) == colnames(rawCounts)) # should return TRUE

head(colData)
```
```{r}
# 5. convert both columns of colData as factors
class(colData$individual_id) # "character"
colData$individual_id <- factor(colData$individual_id)
class(colData$individual_id) # "factor"
colData$tissue_type <- factor(colData$tissue_type)
```
```{r}
# DESeq2 uses the first factor level as reference
levels(colData$tissue_type)  # reference = "colorectal cancer metastatic in the liver"
# Comparisons: "normal" and "primary tumor" vs reference
```
```{r}
# We want to compare "colorectal cancer metastatic in the liver" and  "primary tumor" against "normal" instead
# 6. Change levels order such that  reference = "normal"
levels(colData$tissue_type) <- c(
  "normal",
  "primary tumor",
  "colorectal cancer metastatic in the liver"
)
levels(colData$tissue_type)
```
```{r}
# Create the DEseq2DataSet object
# Design: control variable first, condition of interest second
# DESeq2 adjusts for individual variation before testing tissue_type differences
dds <- DESeqDataSetFromMatrix(countData = rawCounts, colData = colData, design = ~ individual_id + tissue_type)

```

```{r}
dds # prints summary of the DESeqDataSet

```
```{r}
colData(dds)  # shows sample annotations (e.g., tissue_type, individual_id)

```
```{r}
counts(dds)[1:5, 1:5]  # view first few genes and samples
```
```{r}

#  Run the DESeq pipeline
dds <- DESeq(dds)
```

```{r}
# Variance Stabilizing Transformation and  plot PCA
vsd <- vst(dds, blind = FALSE)
plotPCA(vsd, intgroup="tissue_type")+ theme_grey()
```
```{r}
# Note: contrast = c(factor, numerator, denominator)
# means Compute log2 fold change as numerator / denominator.
# contrast = c("tissue_type", "primary tumor", "normal"): positive values mean higher in tumor, negative mean higher in normal
# contrast = c("tissue_type", "normal", "primary tumor"): positive values mean higher in normal, negative mean higher in tumor
```

```{r}
# Extract results
res1 <- results(dds, contrast = c("tissue_type", "primary tumor", "normal"))
res1
```
```{r}
# Remove rows with NA 
res1 <- na.omit(res1)
res1.df <- as.data.frame(res1)
res1.df
```
```{r}
# get gene names for gene IDs
res1.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res1.df), keytype = "ENSEMBL", column = "SYMBOL")
res1.df <- res1.df[!is.na(res1.df$symbol),]

```

```{r}
res2 <- results(dds, contrast = c("tissue_type", "colorectal cancer metastatic in the liver", "normal"))
res2 <- na.omit(res2)
res2.df <- as.data.frame(res2)
res2.df
res2.df$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res2.df), keytype = "ENSEMBL", column = "SYMBOL")
res2.df <- res2.df[!is.na(res2.df$symbol),]
```

```{r}
# Volcano plots 1
EnhancedVolcano(res1.df, x = 'log2FoldChange', y = 'padj', lab = res1.df$symbol,
                pCutoff = 0.05, FCcutoff = 1)

```
```{r}
# Volcano plots 2
EnhancedVolcano(res2.df, x = 'log2FoldChange', y = 'padj', lab = res2.df$symbol,
                pCutoff = 0.05, FCcutoff = 1)
```
```{r}
# Volcano plots using ggplot2

res1.df$expression <- "NOT"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange > 2] <- "UP"
res1.df$expression[res1.df$padj < 0.01 & res1.df$log2FoldChange < -2] <- "DOWN"

ggplot(data = res1.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
  geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
  geom_point(size = 2) +
  scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"), 
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  theme_classic()
```
```{r}
res2.df$expression <- "NOT"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange > 2] <- "UP"
res2.df$expression[res2.df$padj < 0.01 & res2.df$log2FoldChange < -2] <- "DOWN"

ggplot(data = res2.df, aes(x = log2FoldChange, y = -log10(padj), col = expression)) +
  geom_vline(xintercept = c(-2, 2), col = "gray", linetype = 'dashed') +
  geom_hline(yintercept = -log10(0.01), col = "gray", linetype = 'dashed') +
  geom_point(size = 2) +
  scale_color_manual(values = c("#3498DB", "grey", "#E74C3C"), 
                     labels = c("Downregulated", "Not significant", "Upregulated")) +
  theme_classic()
```

```{r}
# Find significant genes
sigs1.df <- res1.df[res1.df$padj < 0.01,]
sigs1.df <- sigs1.df[(abs(sigs1.df$log2FoldChange) > 2),]
sigs1.df <- sigs1.df[sigs1.df$baseMean > 50,] # -  average normalized count for a gene across all samples
sigs1.df


```

```{r}
sigs2.df <- res2.df[res2.df$padj < 0.01,]
sigs2.df <- sigs2.df[(abs(sigs2.df$log2FoldChange) > 2),]
sigs2.df <- sigs2.df[sigs2.df$baseMean > 50,] 
sigs2.df
```

```{r}
# Combine significant genes from both comparisons
sigs_combined <- rbind(sigs1.df, sigs2.df)
# Get unique gene IDs
unique_genes <- unique(c(rownames(sigs1.df), rownames(sigs2.df)))
# Subset by unique rownames
sigs_combined <- sigs_combined[unique_genes, ]
```

```{r}
# Extract normalized counts for those genes
mat <- counts(dds, normalized=T)[rownames(sigs_combined),] # or mat <- assay(vsd)[rownames(sigs_combined), ]
# Z-score transform each gene across samples
mat.z <- t(apply(mat, 1, scale))
colnames(mat.z) <- rownames(colData) 
```

```{r}
tissue <- colData$tissue_type

# Color map
tissue_color <- c(
  "primary tumor" = "darkorange",
  "normal" = "forestgreen",
  "colorectal cancer metastatic in the liver" = "firebrick"
)

ha <- HeatmapAnnotation(
  Tissue = tissue,
  col = list(Tissue = tissue_color)
)
```

```{r}
# Plot Heatmap
Heatmap(
  mat.z,
  name = "Z-score",
  top_annotation = ha,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_column_names = TRUE,
  show_row_names = FALSE,
  row_labels = sigs_combined[rownames(mat.z), "symbol"],# useful when above is TRUE
  row_names_gp = gpar(fontsize = 6),
  column_names_gp = gpar(fontsize = 6),
  heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)
```
```{r}
# Get top 50 genes from both comparisons
topGenes1 <- head(sigs1.df[order(abs(sigs1.df$log2FoldChange), decreasing = TRUE), ], 50)
topGenes2 <- head(sigs2.df[order(abs(sigs2.df$log2FoldChange), decreasing = TRUE), ], 50)
```

```{r}
# Combine both sets
topGenes_combined <- rbind(topGenes1, topGenes2)
# Get unique ENSEMBL IDs
unique_genes <- unique(c(rownames(topGenes1), rownames(topGenes2)))
# Subset by unique rownames
topGenes_combined <- topGenes_combined[unique_genes, ]
```

```{r}
# Extract normalized counts for those genes
topmat <- counts(dds, normalized=T)[rownames(topGenes_combined),] 
topmat.z <- t(apply(topmat, 1, scale)) # Z score transformation of genes across samples.
colnames(topmat.z) <- rownames(colData)
```

```{r}
Heatmap(
  topmat.z,
  name = "Z-score",
  top_annotation = ha,
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_column_names = TRUE,
  show_row_names = TRUE,
  row_labels = topGenes_combined[rownames(topmat.z), "symbol"],
  row_names_gp = gpar(fontsize = 6),
  column_names_gp = gpar(fontsize = 6),
  heatmap_legend_param = list(title = "Z-score", legend_direction = "horizontal")
)
```

